# REPLICATION CODE FOR: Lupu, Noam, Lucia Selios, and Zach Warner. 2016. "A
# New Measure of Congruence: The Earth Mover's Distance." Political Analysis
# (forthcoming).
#
# Last edited 10-Nov-2016 by ZW. For questions please contact me at
# zwarner@wisc.edu.

################################################################################
#                                                                              #
#                                 SETUP                                        #
#                                                                              #
################################################################################
rm(list=ls())
library(ggplot2); library(grid); library(sfsmisc); library(emdist)
library(clusterGeneration); library(foreign); library(plyr); library(sandwich) 
library(lmtest); library(reshape)
setwd("/your/path/to/this/replication")

### version control
sessionInfo()
# R version 3.3.0 (2016-05-03)
# Platform: x86_64-apple-darwin13.4.0 (64-bit)
# Running under: OS X 10.11.6 (El Capitan)
# 
# locale:
# [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
# 
# attached base packages:
# [1] grid      stats     graphics  grDevices utils     datasets  methods  
# [8] base     
# 
# other attached packages:
# [1] reshape_0.8.6           lmtest_0.9-34           zoo_1.7-13             
# [4] sandwich_2.3-4          plyr_1.8.4              foreign_0.8-67         
# [7] clusterGeneration_1.3.4 MASS_7.3-45             emdist_0.3-2           
# [10] sfsmisc_1.1-0           ggplot2_2.1.0          
# 
# loaded via a namespace (and not attached):
# [1] Rcpp_0.12.7      lattice_0.20-34  gtable_0.2.0     scales_0.4.0    
# [5] tools_3.3.0      munsell_0.4.3    rsconnect_0.5    colorspace_1.2-7  

################################################################################
#                                                                              #
#                                 UTILITIES                                    #
#                                                                              #
################################################################################

##### Helper functions #####
# for multiple ggplot objects
multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
  plots <- c(list(...), plotlist)
  numPlots = length(plots)
  if (is.null(layout)) {
    layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
                     ncol = cols, nrow = ceiling(numPlots/cols))
  }
  if (numPlots==1) {
    print(plots[[1]])
  } else {
    grid.newpage()
    pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
    for (i in 1:numPlots) {
      matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
      print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
                                      layout.pos.col = matchidx$col))
    }
  }
}
# to simulate from a bimodal distribution
bimodalDistFunc <- function (n,cpct, mu1, mu2, sig1, sig2) {
  y0 <- rnorm(n,mean=mu1, sd = sig1)
  y1 <- rnorm(n,mean=mu2, sd = sig2)
  
  flag <- rbinom(n,size=1,prob=cpct)
  y <- y0*(1 - flag) + y1*flag 
}
# R^2 approximation from glm objects
r2.corr.mer <- function(m) {
  lmfit <-  lm(model.response(model.frame(m)) ~ fitted(m))
  summary(lmfit)$r.squared
} 
# for clustered SEs
cl <- function(dat,fm, cluster){
  M <- length(unique(cluster))
  N <- length(cluster)
  K <- fm$rank
  dfc <- (M/(M-1))*((N-1)/(N-K))
  uj  <- apply(estfun(fm),2, function(x) tapply(x, cluster, sum));
  vcovCL <- dfc*sandwich(fm, meat=crossprod(uj)/N)
  coeftest(fm, vcovCL) 
}

##### Helper functions for computing congruence #####
# Difference in means
dmeans <- function(data){
  df <- data
  x <- as.matrix(df$samps[which(df$pop == "x")])
  y <- as.matrix(df$samps[which(df$pop == "y")])
  abs(mean(x,na.rm=T)-mean(y,na.rm=T))
}
# CDF overlap
cdf.overlap <- function(data, q = 512) { 
  # no default length for sequences, but we'll use density's 512
  df <- data
  x <- as.matrix(na.omit(df$samps[which(df$pop == "x")]))
  y <- as.matrix(na.omit(df$samps[which(df$pop == "y")]))
  Fx <- ecdf(x); Fy <- ecdf(y)
  lower <- min(x,y) 
  upper <- max(x,y)
  z <- seq(lower,upper,((upper-lower)/q))
  Fxz <- Fx(z); Fyz <- Fy(z) 
  sum(abs(Fxz-Fyz))
}
# PDF overlap
pdf.overlap <- function(data, q = 512){ # q = 512 is the default for density()
  df <- data
  fx <- as.vector(na.omit(df$samps[which(df$pop == "x")]))
  fy <- as.vector(na.omit(df$samps[which(df$pop == "y")]))
  lower <- min(c(fx, fy))
  upper <- max(c(fx, fy))
  dx <- density(fx, from=lower, to=upper, n = q)
  dy <- density(fy, from=lower, to=upper, n = q)
  d <- data.frame(location=dx$x, x.den=dx$y, y.den=dy$y)
  d$intersect <- pmin(d$x.den, d$y.den)
  integrate.xy(d$location, d$intersect)
}
# EMD
emd.dis <- function(data, metric = "manhattan", iterations=100000){
  df <- data
  x <- as.matrix(df$samps[which(df$pop == "x")])
  y <- as.matrix(df$samps[which(df$pop == "y")])
  weight.x <- rep(1/nrow(x),nrow(x))
  weight.y <- rep(1/nrow(y),nrow(y))
  emdw(x,weight.x,y,weight.y,dist=metric,max.iter=iterations)
}

################################################################################
#                                                                              #
#                               INTRODUCTION                                   #
#                                                                              #
################################################################################

##### SIMULATED COMPUTATION TIME (FN 3) #####
# generate the data
x <- genRandomClust(numClust=7,numNonNoisy = 10, numNoisy = 5, numOutlier = .02,
                    rangeN=c(150,500), numReplicate = 1,fileName="x", 
                    outputLogFlag=F, outputEmpirical=F, outputInfo = F, 
                    outputDatFlag = F)
y <- genRandomClust(numClust=3,numNonNoisy = 10, numNoisy = 5, numOutlier = .08, 
                    rangeN=c(150,500), numReplicate = 1,fileName="y", 
                    outputLogFlag=F, outputEmpirical=F, outputInfo = F, 
                    outputDatFlag = F)
x <- x$datList$x_1
y <- y$datList$y_1
w.x <- rep(1/nrow(x),nrow(x))
w.y <- rep(1/nrow(y),nrow(y))
# time it
system.time(
  emd.bigprob <- emdw(x,w.x,y,w.y,max.iter=100000,dist="manhattan")
)
# Results for my machine (mid-2012 MacBook Air, 2 GHz Intel Core i7 processor, 
# 8 GB 1600 MHz DDR3 memory)
# user      system    elapsed 
# 500.493   4.536     511.266 

### clean up
rm(list=ls()[!(ls() %in% c("dmeans","cdf.overlap","pdf.overlap","emd.dis",
                           "r2.corr.mer","multiplot","bimodalDistFunc","cl"))])

################################################################################
#                                                                              #
#                            MEASURING CONGRUENCE                              #
#                                                                              #
################################################################################

### Create the data
set.seed(1234567)
dat.x.y1 <- data.frame(samps = c(rnorm(1000, mean=0, sd=1), 
                                 rnorm(1000, mean=3, sd=1)), 
                       pop = rep(c("x", "y"), each = 1000))
dat.x.y3 <- dat.x.y2 <- dat.x.y1
dat.x.y2$samps[which(dat.x.y2$pop == "y")] <- rnorm(1000,0,8)
dat.x.y3$samps[which(dat.x.y3$pop == "y")] <- bimodalDistFunc(1000,.35,2.5,
                                                              13.1,1,2)

##### FIGURE 1 #####
fig1.1 <- ggplot(dat.x.y1, aes(x = samps, fill = pop)) +
  theme_bw(base_size = 16) +
  theme(panel.grid.minor = element_blank(), axis.text.x = element_blank(),
        panel.border = element_rect(fill=NA,size=1,linetype="solid")) +
  geom_histogram(aes(y = ..density.. ), alpha = 0.5,pos="identity",binwidth=1,
                 show.legend=F) +
  geom_line(aes(y = ..density..), stat = 'density') + 
  scale_fill_manual(values=c("grey20", "grey20")) +
  coord_cartesian(ylim=c(-.01,.44),xlim=c(-11,26)) +
  scale_x_continuous(breaks=seq(-10,25,by=5)) +
  scale_y_continuous(breaks=seq(0,.40,by=.1)) +
  annotate("text",x=-3,y=.20,label="X",family="Times",size=5) +
  annotate("text",x=6.2,y=.10,label="Y[1]",family="Times", parse=T,size=5) +
  labs(title=NULL,x=NULL,y="Density") +
  theme(text=element_text(family="Times"))

fig1.2 <- ggplot(dat.x.y2, aes(x = samps, fill = pop)) +
  theme_bw(base_size = 16) +
  theme(panel.grid.minor = element_blank(), axis.text.x = element_blank(),
        panel.border = element_rect(fill=NA,size=1,linetype="solid")) +
  geom_histogram(aes(y = ..density.. ), alpha = 0.5,pos="identity",binwidth=1,
                 show.legend=F) +
  geom_line(aes(y = ..density..), stat = 'density') + 
  scale_fill_manual(values=c("grey20", "grey20")) +
  coord_cartesian(ylim=c(-.01,.44),xlim=c(-11,26)) +
  scale_x_continuous(breaks=seq(-10,25,by=5)) +
  scale_y_continuous(breaks=seq(0,.40,by=.1)) +
  annotate("text",x=-3,y=.20,label="X",family="Times",size=5) +
  annotate("text",x=6.2,y=.10,label="Y[2]",family="Times", parse=T,size=5) +
  labs(title=NULL,x=NULL,y="Density") +
  theme(text=element_text(family="Times"))

fig1.3 <- ggplot(dat.x.y3, aes(x = samps, fill = pop)) +
  theme_bw(base_size = 16) +
  theme(panel.grid.minor = element_blank(),
        panel.border = element_rect(fill=NA,size=1,linetype="solid")) +
  geom_histogram(aes(y = ..density.. ), alpha = 0.5,pos="identity",binwidth=1,
                 show.legend=F) +
  geom_line(aes(y = ..density..), stat = 'density') + 
  scale_fill_manual(values=c("grey20", "grey20")) +
  coord_cartesian(ylim=c(-.01,.44),xlim=c(-11,26)) +
  scale_x_continuous(breaks=seq(-10,25,by=5)) +
  scale_y_continuous(breaks=seq(0,.40,by=.1)) +
  annotate("text",x=-3,y=.20,label="X",family="Times",size=5) +
  annotate("text",x=6.2,y=.10,label="Y[3]",family="Times", parse=T,size=5) +
  labs(title=NULL,x="z",y="Density") +
  theme(text=element_text(family="Times"))

pdf("figure1.pdf",
    width=6.5,height=9)
grid.newpage()
grid.draw(rbind(ggplotGrob(fig1.1), 
                ggplotGrob(fig1.2), 
                ggplotGrob(fig1.3),size="last"))
dev.off()

################################################################################
#                                                                              #
#                             SIMULATION EVIDENCE                              #
#                                                                              #
################################################################################

##### TABLE 1 #####
### Difference of means distance
o2o.1 <- dmeans(dat.x.y1)
o2o.2 <- dmeans(dat.x.y2)
o2o.3 <- dmeans(dat.x.y3)

### CDF overlap distance
cdf.1 <- cdf.overlap(dat.x.y1)
cdf.2 <- cdf.overlap(dat.x.y2)
cdf.3 <- cdf.overlap(dat.x.y3)

### PDF overlap distance
pdf.1 <- pdf.overlap(dat.x.y1)
pdf.2 <- pdf.overlap(dat.x.y2)
pdf.3 <- pdf.overlap(dat.x.y3)

### EMD
emd.1 <- emd.dis(dat.x.y1)
emd.2 <- emd.dis(dat.x.y2)
emd.3 <- emd.dis(dat.x.y3)

### print the table
tab <- matrix(c(o2o.1,o2o.2,o2o.3,
                cdf.1,cdf.2,cdf.3,
                pdf.1,pdf.2,pdf.3,
                emd.1,emd.2,emd.3),ncol=4)
dimnames(tab) <- list(c("Top","Middle","Bottom"),
                      c("Diff in Means","CDF overlap","PDF Overlap","EMD"))
round(tab, 2)

##### FIGURE 3 #####
### size of each sample to compare
sizedist = 100 
### number of samples to compare
numsamps = 1000 
### create target distribution, a standard-normal
target <- data.frame(samps =  rnorm(sizedist,0,1), 
                     pop = rep("x", sizedist))

### generate means
means <-rnorm(numsamps,10,10)
# create dataframes to store results
df.meanshift <- data.frame(matrix(ncol=0,nrow=numsamps))
df.meanshift$means <- means
df.meanshift$emd <- df.meanshift$pdf <- df.meanshift$cdf <- 
  df.meanshift$diffmeans <-rep(NA,numsamps)
samps.list <- lapply(1:length(means),  
                     function(i) data.frame(matrix(nrow=sizedist,ncol=0)))
samps.df.means <- data.frame(matrix(nrow=sizedist,ncol=numsamps))
# create samples from the random means, then calculate distances and store them
for(i in 1:numsamps){
  print(i)
  # sample from the simulated mean
  samps.list[[i]]$samps <- rnorm(n=sizedist, mean = means[i], sd = 1)
  samps.df.means[,i] <- samps.list[[i]]$samps
  samps.list[[i]]$pop <- rep("y",sizedist)
  # bind the target dist
  samps.list[[i]] <- rbind(samps.list[[i]],target)
  # calculate congruence in various ways
  df.meanshift$diffmeans[i] <- dmeans(samps.list[[i]])
  df.meanshift$cdf[i] <- cdf.overlap(samps.list[[i]])
  df.meanshift$pdf[i] <- pdf.overlap(samps.list[[i]])
  df.meanshift$emd[i] <- emd.dis(samps.list[[i]],iterations = 100000)
}

### generate variances
variances <-runif(numsamps,1,10)
# create dataframes to store results
df.variances <- data.frame(matrix(ncol=0,nrow=numsamps))
df.variances$variances <- variances
df.variances$emd <- df.variances$pdf <- df.variances$cdf <- 
  df.variances$diffmeans <-rep(NA,numsamps)
samps.list <- lapply(1:length(variances),  
                     function(i) data.frame(matrix(nrow=sizedist,ncol=0)))
samps.df.vars <- data.frame(matrix(nrow=sizedist,ncol=numsamps))
# create samples from the random means, then calculate distances and store them
for(i in 1:numsamps){
  print(i)
  # sample from the simulated mean
  samps.list[[i]]$samps <- rnorm(n=sizedist, mean = 0, sd = sqrt(variances[i]))
  samps.df.vars[,i] <- samps.list[[i]]$samps
  samps.list[[i]]$pop <- rep("y",sizedist)
  # bind the target dist
  samps.list[[i]] <- rbind(samps.list[[i]],target)
  # calculate congruence in various ways
  df.variances$diffmeans[i] <- dmeans(samps.list[[i]])
  df.variances$cdf[i] <- cdf.overlap(samps.list[[i]])
  df.variances$pdf[i] <- pdf.overlap(samps.list[[i]])
  df.variances$emd[i] <- emd.dis(samps.list[[i]],iterations = 100000)
}

### Get the means data into shape for plotting
# diff-means
d1 <- as.data.frame(cbind(abs(df.meanshift$means),abs(df.meanshift$diffmeans)))
colnames(d1) <- c("x","y")
d1$x <- d1$x/max(d1$x); d1$y <- d1$y/max(d1$y)
d1$true.rank <-  NA; d1$true.rank[order(d1$x)] <- 1:nrow(d1)
d1$calc.rank <-  NA; d1$calc.rank[order(d1$y)] <- 1:nrow(d1)
# CDF overlap
d2 <- as.data.frame(cbind(abs(df.meanshift$means),abs(df.meanshift$cdf)))
colnames(d2) <- c("x","y")
d2$x <- d2$x/max(d2$x); d2$y <- d2$y/max(d2$y)
d2$true.rank <-  NA; d2$true.rank[order(d2$x)] <- 1:nrow(d2)
d2$calc.rank <-  NA; d2$calc.rank[order(d2$y)] <- 1:nrow(d2)
# PDf overlap (slightly different)
d3 <- as.data.frame(cbind(abs(df.meanshift$means),abs(df.meanshift$pdf)))
colnames(d3) <- c("x","y")
d3$x <- d3$x/max(d3$x)
# the next line inverts distance so closer to 0 means "more similar,"
# like other measures
d3$y <- 1/d3$y
d3$y <- d3$y/max(d3$y) # scales it
d3$true.rank <-  NA; d3$true.rank[order(d3$x)] <- 1:nrow(d3)
d3$calc.rank <-  NA; d3$calc.rank[order(d3$y)] <- 1:nrow(d3)
# EMD
d4 <- as.data.frame(cbind(abs(df.meanshift$means),abs(df.meanshift$emd)))
colnames(d4) <- c("x","y")
d4$x <- d4$x/max(d4$x); d4$y <- d4$y/max(d4$y)
d4$true.rank <-  NA; d4$true.rank[order(d4$x)] <- 1:nrow(d4)
d4$calc.rank <-  NA; d4$calc.rank[order(d4$y)] <- 1:nrow(d4)

### Get the variances data ready for plotting
# subtract 1 off variance to account for target variance = 1, as in text
# diff-means
d5 <- as.data.frame(cbind(abs(df.variances$variances-1)
                          ,abs(df.variances$diffmeans)))
colnames(d5) <- c("x","y")
d5$x <- d5$x/max(d5$x); d5$y <- d5$y/max(d5$y)
d5$true.rank <-  NA; d5$true.rank[order(d5$x)] <- 1:nrow(d5)
d5$calc.rank <-  NA; d5$calc.rank[order(d5$y)] <- 1:nrow(d5)
# CDF overlap
d6 <- as.data.frame(cbind(abs(df.variances$variances-1),abs(df.variances$cdf)))
colnames(d6) <- c("x","y")
d6$x <- d6$x/max(d6$x); d6$y <- d6$y/max(d6$y)
d6$true.rank <-  NA; d6$true.rank[order(d6$x)] <- 1:nrow(d6)
d6$calc.rank <-  NA; d6$calc.rank[order(d6$y)] <- 1:nrow(d6)
# PDf overlap (slightly different)
d7 <- as.data.frame(cbind(abs(df.variances$variances-1),abs(df.variances$pdf)))
colnames(d7) <- c("x","y")
d7$x <- d7$x/max(d7$x)
# the next line inverts distance so closer to 0 means "more similar,"
# like other measures
d7$y <- 1/d7$y 
d7$y <- d7$y/max(d7$y) # scales it
d7$true.rank <-  NA; d7$true.rank[order(d7$x)] <- 1:nrow(d7)
d7$calc.rank <-  NA; d7$calc.rank[order(d7$y)] <- 1:nrow(d7)
# EMD
d8 <- as.data.frame(cbind(abs(df.variances$variances-1),abs(df.variances$emd)))
colnames(d8) <- c("x","y")
d8$x <- d8$x/max(d8$x); d8$y <- d8$y/max(d8$y)
d8$true.rank <-  NA; d8$true.rank[order(d8$x)] <- 1:nrow(d8)
d8$calc.rank <-  NA; d8$calc.rank[order(d8$y)] <- 1:nrow(d8)

### plot
fig3.1 <- ggplot(d1) + 
  theme_bw(base_size = 20) +
  theme(panel.grid.minor = element_blank(),panel.grid.major = element_blank(),
        panel.border = element_rect(fill=NA,size=1,linetype="solid"),
        axis.text = element_text(size=12)) +
  geom_point(aes(x=true.rank,y=calc.rank)) +
  coord_cartesian(xlim=c(0,1000),ylim=c(0,1000)) +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "gray20") + 
  labs(title="Difference in means",
       x="True mean rank",
       y="Calculated distance rank",
       size=16) +
  theme(text=element_text(family="Times"))

fig3.2 <- ggplot(d2) + 
  theme_bw(base_size = 20) +
  theme(panel.grid.minor = element_blank(),panel.grid.major = element_blank(),
        panel.border = element_rect(fill=NA,size=1,linetype="solid"),
        axis.text = element_text(size=12)) +
  geom_point(aes(x=true.rank,y=calc.rank)) +
  coord_cartesian(xlim=c(0,1000),ylim=c(0,1000)) +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "gray20") + 
  labs(title="CDF Overlap",
       x="True mean rank",
       y="Calculated distance rank",
       size=16) +
  theme(text=element_text(family="Times"))

fig3.3 <- ggplot(d3) + 
  theme_bw(base_size = 20) +
  theme(panel.grid.minor = element_blank(),panel.grid.major = element_blank(),
        panel.border = element_rect(fill=NA,size=1,linetype="solid"),
        axis.text = element_text(size=12)) +
  geom_point(aes(x=true.rank,y=calc.rank)) +
  coord_cartesian(xlim=c(0,1000),ylim=c(0,1000)) +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "gray20") + 
  labs(title="PDF Overlap",
       x="True mean rank",
       y="Calculated distance rank",size=16) +
  theme(text=element_text(family="Times"))

fig3.4 <- ggplot(d4) + 
  theme_bw(base_size = 20) +
  theme(panel.grid.minor = element_blank(),panel.grid.major = element_blank(),
        panel.border = element_rect(fill=NA,size=1,linetype="solid"),
        axis.text = element_text(size=12)) +
  geom_point(aes(x=true.rank,y=calc.rank)) +
  coord_cartesian(xlim=c(0,1000),ylim=c(0,1000)) +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "gray20") + 
  labs(title="EMD",
       x="True mean rank",
       y="Calculated distance rank",
       size=16) +
  theme(text=element_text(family="Times"))

fig3.5 <- ggplot(d5) + 
  theme_bw(base_size = 20) +
  theme(panel.grid.minor = element_blank(),panel.grid.major = element_blank(),
        panel.border = element_rect(fill=NA,size=1,linetype="solid"),
        axis.text = element_text(size=12)) +
  geom_point(aes(x=true.rank,y=calc.rank)) +
  coord_cartesian(xlim=c(0,1000),ylim=c(0,1000)) +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "gray20") + 
  labs(title="Difference in means",
       x="True variance rank",
       y="Calculated distance rank",
       size=16) +
  theme(text=element_text(family="Times"))

fig3.6 <- ggplot(d6) + 
  theme_bw(base_size = 20) +
  theme(panel.grid.minor = element_blank(),panel.grid.major = element_blank(),
        panel.border = element_rect(fill=NA,size=1,linetype="solid"),
        axis.text = element_text(size=12)) +
  geom_point(aes(x=true.rank,y=calc.rank)) +
  coord_cartesian(xlim=c(0,1000),ylim=c(0,1000)) +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "gray20") + 
  labs(title="CDF Overlap",
       x="True variance rank",
       y="Calculated distance rank",
       size=16) +
  theme(text=element_text(family="Times"))

fig3.7 <- ggplot(d7) + 
  theme_bw(base_size = 20) +
  theme(panel.grid.minor = element_blank(),panel.grid.major = element_blank(),
        panel.border = element_rect(fill=NA,size=1,linetype="solid"),
        axis.text = element_text(size=12)) +
  geom_point(aes(x=true.rank,y=calc.rank)) +
  coord_cartesian(xlim=c(0,1000),ylim=c(0,1000)) +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "gray20") + 
  labs(title="PDF Overlap",
       x="True variance rank",
       y="Calculated distance rank",
       size=16) +
  theme(text=element_text(family="Times"))

fig3.8 <- ggplot(d8) + 
  theme_bw(base_size = 20) +
  theme(panel.grid.minor = element_blank(),panel.grid.major = element_blank(),
        panel.border = element_rect(fill=NA,size=1,linetype="solid"),
        axis.text = element_text(size=12)) +
  geom_point(aes(x=true.rank,y=calc.rank)) +
  coord_cartesian(xlim=c(0,1000),ylim=c(0,1000)) +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "gray20") + 
  labs(title="EMD",
       x="True variance rank",
       y="Calculated distance rank",
       size=16) +
  theme(text=element_text(family="Times"))

pdf("figure3.pdf",width=16,height=8)
multiplot(fig3.1,fig3.5,fig3.2,fig3.6,fig3.3,fig3.7,fig3.4,fig3.8, cols=4)
dev.off()

##### TABLE 2 #####
### calculate RMSE for ordinality
rmse.ord <- vector()
for(i in 1:8){
  tmp <- eval(parse(text=paste("d",i,sep="")))
  tmp$sqerr <- (tmp$true.rank - tmp$calc.rank)^2
  rmse.ord[i] <- sqrt(sum(tmp$sqerr)/nrow(tmp))
}
### calculate RMSE for cardinality
rmse.card <- vector()
for(i in 1:8){
  tmp <- eval(parse(text=paste("d",i,sep="")))
  tmp$sqerr <- (tmp$x - tmp$y)^2
  rmse.card[i] <- sqrt(sum(tmp$sqerr)/nrow(tmp))
}

### print the table
rmses <- c(rmse.ord,rmse.card)
rmses <- matrix(rmses,ncol=4,byrow=F)
dimnames(rmses) <- list(c("Diff in Means","CDF overlap","PDF Overlap","EMD"),
                        c("Ord: Mean","Ord: Variance", "Card: Mean",
                          "Card: Variance"))
round(rmses, 2)

### clean up
rm(list=ls()[!(ls() %in% c("dmeans","cdf.overlap","pdf.overlap","emd.dis",
                           "r2.corr.mer","multiplot","bimodalDistFunc","cl",
                           "samps.df.means","samps.df.vars","d1","d2","d3",
                           "d4","d5","d6","d7","d8","target"))])

################################################################################
#                                                                              #
#                           EMPIRICAL APPLICATIONS                             #
#                                                                              #
################################################################################

##### TABLE 3: Electoral Rules and Ideological Congruence #####
### import data
df <- read.csv("Lupu-Selios-Warner-GSreplication.csv", stringsAsFactors = F)

### G+S models
# majoritarianism
mod1 <- glm(cdf_cong_40_rep ~ as.factor(majoritarian_gs), data=df)
summary(mod1)
nobs(mod1)
length(unique(df$country[which(!is.na(df$cdf_cong_40_rep) & 
                                 !is.na(df$majoritarian_gs))]))
r2.corr.mer(mod1)
# disproportionality
mod2 <- glm(cdf_cong_40_rep ~ disproportionality_gs, data=df)
summary(mod2)
nobs(mod2)
length(unique(df$country[which(!is.na(df$cdf_cong_40_rep) & 
                                 !is.na(df$disproportionality_gs))]))
r2.corr.mer(mod2)

### Using the EMD on G+S sample
# majoritarianism
df.gs <- df[which(!is.na(df$cdf_cong_40_rep)),]
mod3 <- glm(emd_educit ~ as.factor(majoritarian_gs), data=df.gs)
summary(mod3)
nobs(mod3)
length(unique(df.gs$country[which(!is.na(df.gs$emd_educit) & 
                                 !is.na(df.gs$majoritarian_gs))]))
r2.corr.mer(mod3)
# disproportionality
mod4 <- glm(emd_educit ~ disproportionality_gs, data=df.gs)
summary(mod4)
nobs(mod4)
length(unique(df.gs$country[which(!is.na(df.gs$emd_educit) & 
                                 !is.na(df.gs$disproportionality_gs))]))
r2.corr.mer(mod4)

### Using the EMD on the full sample, with clustered SEs
df.dem <- df[!is.na(df$polity2),]
# G+S drop polity scores below 6, but for our data, this would remove Israel,
# which is in their data. Instead, we drop scores below 5. Note that this does
# not retain any countries that would be dropped at >6 other than Israel.
df.dem <- df.dem[which(df.dem$polity2 > 5),] 
# only use B+G's parliamentary countries
df.dem <- df.dem[which(df.dem$parliamentary_bg == 1),]
# remove NAs now rather than during estimation, so we can cluster SEs
df.mod5 <- df.dem[which(!is.na(df.dem$emd_educit) & 
                     !is.na(df.dem$majoritarian_bg)),]
df.mod6 <- df.dem[which(!is.na(df.dem$emd_educit) & 
                     !is.na(df.dem$disproportionality_gan)),]
# majoritarianism
mod5 <- glm(emd_educit ~ as.factor(majoritarian_bg), data=df.mod5)
nobs(mod5); length(unique(df.mod5$country)); r2.corr.mer(mod5)
cl(df.mod5, mod5, df.mod5$country)
# disproportionality
mod6 <- glm(emd_educit ~ disproportionality_gan, data=df.mod6)
nobs(mod6); length(unique(df.mod6$country)); r2.corr.mer(mod6)
cl(df.mod6, mod6, df.mod6$country)

### clean up
rm(list=ls()[!(ls() %in% c("dmeans","cdf.overlap","pdf.overlap","emd.dis",
                           "r2.corr.mer","multiplot","bimodalDistFunc","cl",
                           "samps.df.means","samps.df.vars","d1","d2","d3",
                           "d4","d5","d6","d7","d8","target"))])

##### FIGURE 4: Mass-Elite Congruence in Latin America #####
# import data
emds <- read.csv("Lupu-Selios-Warner-LAPOP-PELA.csv")

# get the variables for which we want to run regressions
vars <- c("dem_age","pres_years_office","party_years_office","party_age",
          "volatility","enep","magnitude","comp_vote","gdp_pc_1000","poverty",
          "hdi","gini")

# run a loop regressing the EMD on each variable and simulating predicted
# effects of moving across the interquartile range
preds <- list()
for(i in vars){
  eval(parse(text=paste("mod.",i," <- glm(emd_all ~ ",i,", data = emds)",
                        sep="")))
  if(i != 'comp_vote'){
    eval(parse(text=paste("X.",i,".lo <- matrix(c(1,summary(emds$",i,")[2]))",
                          sep="")))
    eval(parse(text=paste("X.",i,".hi <- matrix(c(1,summary(emds$",i,")[5]))",
                          sep="")))
  } else {
    eval(parse(text=paste("X.",i,".lo <- matrix(c(1,0))",sep="")))
    eval(parse(text=paste("X.",i,".hi <- matrix(c(1,1))",sep="")))
  }
  eval(parse(text=paste("beta.tilde.",i," <- mvrnorm(1000,coef(mod.",i,
                        "),vcov(mod.",i,"))",sep="")))
  eval(parse(text=paste("sims.",i,".lo <- (beta.tilde.",i," %*% X.",i,
                        ".lo)",sep="")))
  eval(parse(text=paste("sims.",i,".hi <- (beta.tilde.",i," %*% X.",i,
                        ".hi)",sep="")))
  eval(parse(text=paste("sims.",i," <- sims.",i,".hi - sims.",i,
                        ".lo",sep="")))
  eval(parse(text=paste("pred.",i," <- data.frame(t(c(quantile(sims.",i,
                        ", .025),mean(sims.",i,"),quantile(sims.",i,
                        ", .975))))",sep="")))
  eval(parse(text=paste("colnames(pred.",i,") <- c('lower','mean','upper')",
                        sep="")))
  eval(parse(text=paste("pred.",i,"$var <- '",i,"'",sep="")))
  eval(parse(text=paste("preds[[which(vars == i)]] <- pred.",i,sep="")))
}
# store results
preds <- do.call(rbind, preds)
preds$iter <- rev(seq(1,nrow(preds),1))
preds$dv <- "full"
preds_full <- preds

### run the same loop but where ideology is the only dimension
preds <- as.list(NULL)
for(i in vars){
  eval(parse(text=paste("mod.",i," <- glm(emd_ideology ~ ",i,", data = emds)",
                        sep="")))
  if(i != 'comp_vote'){
    eval(parse(text=paste("X.",i,".lo <- matrix(c(1,summary(emds$",i,")[2]))",
                          sep="")))
    eval(parse(text=paste("X.",i,".hi <- matrix(c(1,summary(emds$",i,")[5]))",
                          sep="")))
  } else {
    eval(parse(text=paste("X.",i,".lo <- matrix(c(1,0))",sep="")))
    eval(parse(text=paste("X.",i,".hi <- matrix(c(1,1))",sep="")))
  }
  eval(parse(text=paste("beta.tilde.",i," <- mvrnorm(1000,coef(mod.",i,
                        "),vcov(mod.",i,"))",sep="")))
  eval(parse(text=paste("sims.",i,".lo <- (beta.tilde.",i," %*% X.",i,".lo)",
                        sep="")))
  eval(parse(text=paste("sims.",i,".hi <- (beta.tilde.",i," %*% X.",i,".hi)",
                        sep="")))
  eval(parse(text=paste("sims.",i," <- sims.",i,".hi - sims.",i,".lo",sep="")))
  eval(parse(text=paste("pred.",i," <- data.frame(t(c(quantile(sims.",i,
                        ", .025),mean(sims.",i,"),quantile(sims.",i,
                        ", .975))))",sep="")))
  eval(parse(text=paste("colnames(pred.",i,") <- c('lower','mean','upper')",
                        sep="")))
  eval(parse(text=paste("pred.",i,"$var <- '",i,"'",sep="")))
  eval(parse(text=paste("preds[[which(vars == i)]] <- pred.",i,sep="")))
}
# store the results
preds <- do.call(rbind, preds)
preds$iter <- rev(seq(0.8,nrow(preds)-.2,1))
preds$dv <- "ideology"

### merge the two loops
preds_full <- rbind(preds_full,preds)
preds_full$cov <- ifelse(preds_full$lower < 0 & preds_full$upper > 0,1,2)
varlabs <- c("Age of democracy","Years president in office",
             "Years party in office","Average party age","Electoral volatility",
             "Effective number of parties","Average district magnitude",
             "Compulsory voting","GDP per capita","Poverty rate",
             "Human Development Index","Inequality")

### plot
fig4 <- ggplot(preds_full, aes(x=mean,y=iter)) + 
  theme_bw() + 
  theme(panel.grid.minor = element_blank(), 
        panel.grid.major = element_blank()) +
  geom_vline(xintercept=0,linetype="longdash",colour="darkgray") +
  geom_segment(data=preds_full,aes(x=lower,xend=upper,y=iter,yend=iter,
                                   colour=factor(cov),linetype=factor(dv)),
               size=.5,show.legend=F) +
  geom_point(shape=21,aes(x=mean,fill=factor(cov)),size=2,show.legend=F) +
  scale_fill_manual(values=c("white", "black")) +
  scale_colour_manual(values=c("black", "black")) +
  scale_y_continuous(limits = c(.78,12), breaks = c(1:12),labels=rev(varlabs)) +
  theme(axis.text.y = element_text(size = rel(1.2))) +
  labs(title=NULL,x="Change in EMD",y=NULL) +
  theme(text=element_text(family="Times"))

pdf("figure4.pdf",width=6,height=6)
fig4
dev.off() 

### clean up
rm(list=ls()[!(ls() %in% c("dmeans","cdf.overlap","pdf.overlap","emd.dis",
                           "r2.corr.mer","multiplot","bimodalDistFunc","cl",
                           "samps.df.means","samps.df.vars","d1","d2","d3",
                           "d4","d5","d6","d7","d8","target","vars"))])

################################################################################
#                                                                              #
#                                APPENDIX                                      #
#                                                                              #
################################################################################

##### FIGURE A1 #####
figA1.1 <- ggplot(d1) + 
  theme_bw(base_size = 20) +
  theme(panel.grid.minor = element_blank(),panel.grid.major = element_blank(),
        panel.border = element_rect(fill=NA,size=1,linetype="solid"),
        axis.text = element_text(size=12)) +
  geom_point(aes(x=x,y=y)) +
  coord_cartesian(xlim=c(0,1),ylim=c(0,1)) +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "gray20") + 
  labs(title="Difference in means",
       x="True mean (scaled)",
       y="Calculated distance (scaled)",
       size=16) +
  theme(text=element_text(family="Times"))
figA1.2 <- ggplot(d2) + 
  theme_bw(base_size = 20) +
  theme(panel.grid.minor = element_blank(),panel.grid.major = element_blank(),
        panel.border = element_rect(fill=NA,size=1,linetype="solid"),
        axis.text = element_text(size=12)) +
  geom_point(aes(x=x,y=y)) +
  coord_cartesian(xlim=c(0,1),ylim=c(0,1)) +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "gray20") + 
  labs(title="CDF Overlap",
       x="True mean (scaled)",
       y="Calculated distance (scaled)",
       size=16) +
  theme(text=element_text(family="Times"))
figA1.3 <- ggplot(d3) + 
  theme_bw(base_size = 20) +
  theme(panel.grid.minor = element_blank(),panel.grid.major = element_blank(),
        panel.border = element_rect(fill=NA,size=1,linetype="solid"),
        axis.text = element_text(size=12)) +
  geom_point(aes(x=x,y=y)) +
  coord_cartesian(xlim=c(0,1),ylim=c(0,1)) +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "gray20") + 
  labs(title="PDF Overlap",
       x="True mean (scaled)",
       y="Calculated distance (scaled)",
       size=16) +
  theme(text=element_text(family="Times"))
figA1.4 <- ggplot(d4) + 
  theme_bw(base_size = 20) +
  theme(panel.grid.minor = element_blank(),panel.grid.major = element_blank(),
        panel.border = element_rect(fill=NA,size=1,linetype="solid"),
        axis.text = element_text(size=12)) +
  geom_point(aes(x=x,y=y)) +
  coord_cartesian(xlim=c(0,1),ylim=c(0,1)) +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "gray20") + 
  labs(title="EMD",
       x="True mean (scaled)",
       y="Calculated distance (scaled)",
       size=16) +
  theme(text=element_text(family="Times"))
figA1.5 <- ggplot(d5) + 
  theme_bw(base_size = 20) +
  theme(panel.grid.minor = element_blank(),panel.grid.major = element_blank(),
        panel.border = element_rect(fill=NA,size=1,linetype="solid"),
        axis.text = element_text(size=12)) +
  geom_point(aes(x=x,y=y)) +
  coord_cartesian(xlim=c(0,1),ylim=c(0,1)) +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "gray20") + 
  labs(title="Difference in means",
       x="True variance (scaled)",
       y="Calculated distance (scaled)",
       size=16) +
  theme(text=element_text(family="Times"))
figA1.6 <- ggplot(d6) + 
  theme_bw(base_size = 20) +
  theme(panel.grid.minor = element_blank(),panel.grid.major = element_blank(),
        panel.border = element_rect(fill=NA,size=1,linetype="solid"),
        axis.text = element_text(size=12)) +
  geom_point(aes(x=x,y=y)) +
  coord_cartesian(xlim=c(0,1),ylim=c(0,1)) +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "gray20") + 
  labs(title="CDF Overlap",
       x="True variance (scaled)",
       y="Calculated distance (scaled)",
       size=16) +
  theme(text=element_text(family="Times"))
figA1.7 <- ggplot(d7) + 
  theme_bw(base_size = 20) +
  theme(panel.grid.minor = element_blank(),panel.grid.major = element_blank(),
        panel.border = element_rect(fill=NA,size=1,linetype="solid"),
        axis.text = element_text(size=12)) +
  geom_point(aes(x=x,y=y)) +
  coord_cartesian(xlim=c(0,1),ylim=c(0,1)) +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "gray20") + 
  labs(title="PDF Overlap",
       x="True variance (scaled)",
       y="Calculated distance (scaled)",
       size=16) +
  theme(text=element_text(family="Times"))
figA1.8 <- ggplot(d8) + 
  theme_bw(base_size = 20) +
  theme(panel.grid.minor = element_blank(),panel.grid.major = element_blank(),
        panel.border = element_rect(fill=NA,size=1,linetype="solid"),
        axis.text = element_text(size=12)) +
  geom_point(aes(x=x,y=y)) +
  coord_cartesian(xlim=c(0,1),ylim=c(0,1)) +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "gray20") + 
  labs(title="EMD",
       x="True variance (scaled)",
       y="Calculated distance (scaled)",
       size=16) +
  theme(text=element_text(family="Times"))

pdf("appendix-figure1.pdf",width=16,height=8)
multiplot(figA1.1,figA1.5,figA1.2,figA1.6,figA1.3,figA1.7,figA1.4,figA1.8, 
          cols=4)
dev.off()

##### FIGURE A2 #####
long.means <- melt(samps.df.means)
long.vars <- melt(samps.df.vars)

figA2.1 <- ggplot(long.means, aes(x=value)) + 
  theme_bw() +
  geom_density(aes(group=variable),color="gray55",size=.01) +
  geom_density(data=target, aes(x=samps),size=.5) +
  coord_cartesian(xlim=c(-21,45),ylim=c(0,0.65)) +
  labs(title=NULL,x=NULL,y="Density") +
  theme(text=element_text(family="Times"))
figA2.2 <- ggplot(long.vars, aes(x=value)) + 
  theme_bw() +
  geom_density(aes(group=variable),color="gray55",size=.01) +
  geom_density(data=target, aes(x=samps),size=.5)  +
  coord_cartesian(xlim=c(-21,45),ylim=c(0,0.65)) +
  labs(title=NULL,x="z",y="Density") +
  theme(text=element_text(family="Times"))

pdf("appendix-figure2.pdf",width=6,height=6)
multiplot(figA2.1,figA2.2,cols=1)
dev.off()

### clean up
rm(list=ls()[!(ls() %in% c("dmeans","cdf.overlap","pdf.overlap","emd.dis",
                           "r2.corr.mer","multiplot","bimodalDistFunc","cl",
                           "vars"))])

##### TABLE A1 #####
### read in the two datasets
# top half
df <- read.csv("Lupu-Selios-Warner-GSreplication.csv", stringsAsFactors = F)
round(summary(df$emd_educit),2)
round(summary(df$cdf_cong_40_rep),2)
round(summary(df$majoritarian_gs),2)
round(summary(df$majoritarian_bg),2)
round(summary(df$disproportionality_gs),2)
round(summary(df$disproportionality_gan),2)
# bottom half
emds <- read.csv("Lupu-Selios-Warner-LAPOP-PELA.csv", stringsAsFactors = F)
round(summary(emds$emd_all),2)
for(i in vars){
  eval(parse(text=paste("print(round(summary(emds$",i,"),2))",sep="")))
}
# NB: Obs is just nrow(df) or nrow(emds) minus the number of NAs

##### TABLE A2 #####
### EMD on full sample *without* clustered SEs
df.dem <- df[!is.na(df$polity2),]
df.dem <- df.dem[which(df.dem$polity2 > 5),] 
df.dem <- df.dem[which(df.dem$parliamentary_bg == 1),]
df.mod5 <- df.dem[which(!is.na(df.dem$emd_educit) & 
                          !is.na(df.dem$majoritarian_bg)),]
df.mod6 <- df.dem[which(!is.na(df.dem$emd_educit) & 
                          !is.na(df.dem$disproportionality_gan)),]
# majoritarianism
mod5 <- glm(emd_educit ~ as.factor(majoritarian_bg), data=df.mod5)
nobs(mod5); length(unique(df.mod5$country)); r2.corr.mer(mod5)
summary(mod5)
# disproportionality
mod6 <- glm(emd_educit ~ disproportionality_gan, data=df.mod6)
nobs(mod6); length(unique(df.mod6$country)); r2.corr.mer(mod6)
summary(mod6)

### EMD on full sample, using *all citizens* instead of educated ones only
df.dem <- df[!is.na(df$polity2),]
df.dem <- df.dem[which(df.dem$polity2 > 5),] 
df.dem <- df.dem[which(df.dem$parliamentary_bg == 1),]
df.mod5 <- df.dem[which(!is.na(df.dem$emd_allcit) & 
                          !is.na(df.dem$majoritarian_bg)),]
df.mod6 <- df.dem[which(!is.na(df.dem$emd_allcit) & 
                          !is.na(df.dem$disproportionality_gan)),]
# majoritarianism
mod5 <- glm(emd_allcit ~ as.factor(majoritarian_bg), data=df.mod5)
nobs(mod5); length(unique(df.mod5$country)); r2.corr.mer(mod5)
cl(df.mod5, mod5, df.mod5$country)
# disproportionality
mod6 <- glm(emd_allcit ~ disproportionality_gan, data=df.mod6)
nobs(mod6); length(unique(df.mod6$country)); r2.corr.mer(mod6)
cl(df.mod6, mod6, df.mod6$country)

### EMD on full sample, *using strict parliamentary coding from B+G*
df.dem <- df[!is.na(df$polity2),]
df.dem <- df.dem[which(df.dem$polity2 > 5),]
df.dem <- df.dem[which(df.dem$parliamentary_bg_alt == 1),]
df.mod5 <- df.dem[which(!is.na(df.dem$emd_educit) & 
                     !is.na(df.dem$majoritarian_bg)),]
df.mod6 <- df.dem[which(!is.na(df.dem$emd_educit) & 
                     !is.na(df.dem$disproportionality_gan)),]
# majoritarianism
mod5 <- glm(emd_educit ~ as.factor(majoritarian_bg), data=df.mod5)
nobs(mod5); length(unique(df.mod5$country)); r2.corr.mer(mod5)
cl(df.mod5, mod5, df.mod5$country)
# disproportionality
mod6 <- glm(emd_educit ~ disproportionality_gan, data=df.mod6)
nobs(mod6); length(unique(df.mod6$country)); r2.corr.mer(mod6)
cl(df.mod6, mod6, df.mod6$country)

### EMD on full sample, *using DPI coding* of variables
df.dem <- df[!is.na(df$polity2),]
df.dem <- df.dem[df.dem$polity2 > 5,]
df.dem <- df.dem[df.dem$parliamentary_dpi == 1,]
df.mod5 <- df.dem[which(!is.na(df.dem$emd_educit) & 
                          !is.na(df.dem$majoritarian_dpi)),]
df.mod6 <- df.dem[which(!is.na(df.dem$emd_educit) & 
                          !is.na(df.dem$disproportionality_gan)),]
# majoritarianism
mod5 <- glm(emd_educit ~ as.factor(majoritarian_dpi), data=df.mod5)
nobs(mod5); length(unique(df.mod5$country)); r2.corr.mer(mod5)
cl(df.mod5, mod5, df.mod5$country)
# disproportionality
mod6 <- glm(emd_educit ~ disproportionality_gan, data=df.mod6)
nobs(mod6); length(unique(df.mod6$country)); r2.corr.mer(mod6)
cl(df.mod6, mod6, df.mod6$country)

##### TABLE A3 #####
emds <- emds[!is.na(emds$emd_all),]
tab <- emds[,c("country","year","emd_all","emd_ideology","emd_samesex",
        "emd_privatization","emd_wellbeing","emd_jobs","emd_inequality",
        "emd_health")]
tab[,c(2:ncol(tab))] <- round(tab[,c(2:ncol(tab))],2)
tab

##### TABLE A5 #####
# run a loop regressing the EMD on each variable and printing results
for(i in vars){
  eval(parse(text=paste("mod.",i," <- glm(emd_all ~ ",i,", data = emds)",
                        sep="")))
  eval(parse(text=paste("print(summary(mod.",i,"))",sep="")))
  eval(parse(text=paste("print(nobs(mod.",i,"))",sep="")))
  eval(parse(text=paste("print(r2.corr.mer(mod.",i,"))",sep="")))
}

##### TABLE A6 #####
for(i in vars){
  eval(parse(text=paste("mod.",i," <- glm(emd_all_euclid ~ ",i,
                        ", data = emds)",sep="")))
  eval(parse(text=paste("print(summary(mod.",i,"))",sep="")))
  eval(parse(text=paste("print(nobs(mod.",i,"))",sep="")))
  eval(parse(text=paste("print(r2.corr.mer(mod.",i,"))",sep="")))
}

##### TABLE A7 #####
for(i in vars){
  eval(parse(text=paste("mod.",i," <- glm(emd_ideology ~ ",i,", data = emds)",
                        sep="")))
  eval(parse(text=paste("print(summary(mod.",i,"))",sep="")))
  eval(parse(text=paste("print(nobs(mod.",i,"))",sep="")))
  eval(parse(text=paste("print(r2.corr.mer(mod.",i,"))",sep="")))
}
